This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.
Task: Try to estimate the price based on given features.

The term Boosting refers to a family of algorithms that convert weak learners into strong learners.
There are many boosting algorithms which add an extra boost to a model's accuracy. In this tutorial, we'll learn about the two most commonly used algorithms, i.e., Gradient Boosting (GBM) and XGBoost.
Generally, XGBoost is considered more advanced than GBM.
import sys
ENV_BHISHAN = 'bhishan' in sys.modules
if ENV_BHISHAN:
print('Environment: Personal environment')
import src
import bhishan
%load_ext autoreload
%autoreload 2
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
#!pip install hpsklearn
!pip install shap
# set OMP_NUM_THREADS=1 for hpsklearn package
#!export OMP_NUM_THREADS=1
print('Environment: Google Colab')
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import os
import time

# random state (fixed so train/test splits and model fits are reproducible)
SEED=100
np.random.seed(SEED) # we need this in each cell

# Jupyter notebook settings for pandas display
pd.set_option('display.max_columns', 200)
# pd.set_option('display.float_format', '{:,.4f}'.format) # numbers sep by comma
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)

# Print versions of the core libraries used in this notebook.
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])

# Jupyter cell magic: raise the auto-scroll threshold so long outputs
# are not collapsed into a scrollable box.
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

import scipy
import sklearn
print([(x.__name__,x.__version__) for x in [scipy, sklearn]])
# scale and split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# pipeline
from sklearn.pipeline import Pipeline
# classifier
import xgboost as xgb
# six and pickle
import six
import pickle
import joblib
# metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
# cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
# Running evaluation table: one row is appended per fitted model.
_eval_columns = [
    'Model',
    'Details',
    'Train Neg MSE K-Fold Cross Validation',
    'Test RMSE',
    'Test Explained Variance Score',
    'Test R-squared',
    'Test Adjusted R-squared',
]
df_eval = pd.DataFrame({name: [] for name in _eval_columns})
# model evaluation using snap
import shap
# shap_values = shap.TreeExplainer(model_xgb).shap_values(Xtest)
# shap.summary_plot(shap_values, Xtest)
# shap.dependence_plot("column_name", shap_values, Xtest)
def show_method_attributes(obj, ncols=7, start=None, inside=None):
    """Show the public attributes of an object as a table.

    Parameters
    ----------
    obj : object
        Any object; its ``dir()`` listing is inspected.
    ncols : int
        Number of columns in the returned table.
    start : str, tuple or list, optional
        Keep only attributes starting with this prefix (or any of
        these prefixes).
    inside : str, tuple or list, optional
        Keep only attributes containing this substring (or any of
        these substrings).

    Returns
    -------
    pandas.DataFrame
        Attribute names laid out across ``ncols`` columns, padded
        with empty strings.

    Example
    =======
    show_method_attributes(list)
    """
    print(f'Object Type: {type(obj)}\n')
    # drop dunder/private names
    lst = [elem for elem in dir(obj) if elem[0] != '_']
    # drop common module aliases that sometimes leak into dir()
    lst = [elem for elem in lst
           if elem not in 'os np pd sys time psycopg2'.split()]
    if isinstance(start, str):
        lst = [elem for elem in lst if elem.startswith(start)]
    if isinstance(start, (tuple, list)):
        # NOTE(review): an attribute matching several prefixes appears
        # once per matching prefix — confirm this duplication is intended.
        lst = [elem for elem in lst for start_elem in start
               if elem.startswith(start_elem)]
    if isinstance(inside, str):
        lst = [elem for elem in lst if inside in elem]
    if isinstance(inside, (tuple, list)):
        lst = [elem for elem in lst for inside_elem in inside
               if inside_elem in elem]
    # split into ncols chunks, transpose to a wide table, blank out NaNs
    return pd.DataFrame(np.array_split(lst, ncols)).T.fillna('')
def adjustedR2(rsquared, nrows, kcols):
    """Return the adjusted r-squared value.

    Adjusted r-squared penalizes r-squared for the number of
    predictors: it depends on the number of rows and columns of the
    test data and is never larger than the original r-squared.

    Parameters
    ----------
    rsquared : float
        Ordinary r-squared score.
    nrows : int
        Number of observations (rows of the test data).
    kcols : int
        Number of columns of the test data.

    Note
    ----
    This is algebraically 1 - (1 - R^2) * (n - 1) / (n - k), i.e. it
    treats ``kcols`` as already including the intercept term.
    """
    return rsquared - (kcols - 1) / (nrows - kcols) * (1 - rsquared)
# df_clean = pd.read_csv('../data/processed/data_cleaned_encoded.csv')
# Load the cleaned & encoded dataset directly from GitHub (?raw=true gives the raw file).
df_clean = pd.read_csv('https://github.com/bhishanpdl/Project_House_Price_Prediction/blob/master/data/processed/data_cleaned_encoded.csv?raw=true')
print(df_clean.shape)
df_clean.head()

# I will just take column names from this and will use cleaned data further.
# Only the header is needed, hence nrows=1.
# df_raw = pd.read_csv('../data/raw/kc_house_data.csv')
df_raw = pd.read_csv('https://github.com/bhishanpdl/Project_House_Price_Prediction/blob/master/data/raw/kc_house_data.csv?raw=true',nrows=1)
df_raw.columns
# Raw house features used for modelling (target 'price' excluded).
features_raw_all = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]

# Keep only the raw features plus the target column.
df = df_clean[features_raw_all + ['price']]

# Map feature index -> name (used later to decode xgboost's f0, f1, ... labels).
dict_features = {idx: name for idx, name in enumerate(features_raw_all)}
print(dict_features)
# Log-transform the target and the heavily right-skewed size features.
# (The `for` body indentation was lost in the notebook export; restored.)
log_cols = ['price', 'sqft_living', 'sqft_living15', 'sqft_lot', 'sqft_lot15']
for col in log_cols:
    # log1p keeps zero values (e.g. no basement) finite
    df[col] = np.log1p(df[col].to_numpy())

# Features and target as numpy arrays.
X = df[features_raw_all].to_numpy()
y = df['price'].to_numpy()

# Hold out 20% of the data as the final test set.
Xtrain_orig, Xtest, ytrain_orig, ytest = train_test_split(X, y,
                                                          test_size=0.20,
                                                          random_state=SEED)

# DataFrame views with named columns (handy for plotting / shap).
df_Xtrain_orig = pd.DataFrame(Xtrain_orig, columns=features_raw_all)
df_Xtest = pd.DataFrame(Xtest, columns=features_raw_all)

# quick sanity check of shapes and first values
Xtrain_orig.shape, ytrain_orig.shape, Xtest.shape, ytest.shape, Xtrain_orig[0][:2], Xtest[0][:2]
# Split training data again to carve out a validation set (64/16/20 overall).
Xtrain, Xvalid, ytrain, yvalid = train_test_split(Xtrain_orig, ytrain_orig,
random_state=SEED, test_size=0.2)
df_Xtrain = pd.DataFrame(Xtrain, columns=features_raw_all)
df_Xvalid = pd.DataFrame(Xvalid, columns=features_raw_all)

# Fit the scaler on the training portion only and transform every split,
# so no test-set statistics leak into training.
scaler = StandardScaler() # standard scaler better for regression
scaler.fit(Xtrain_orig)
Xtrain_orig_scaled = scaler.transform(Xtrain_orig)
Xtest_scaled = scaler.transform(Xtest)
Xtrain_scaled = scaler.transform(Xtrain)
Xvalid_scaled = scaler.transform(Xvalid)

# DataFrame views of the scaled arrays with named columns.
df_Xtrain_orig_scaled = pd.DataFrame(Xtrain_orig_scaled, columns=features_raw_all)
df_Xtrain_scaled = pd.DataFrame(Xtrain_scaled, columns=features_raw_all)
df_Xtest_scaled = pd.DataFrame(Xtest_scaled, columns=features_raw_all)
df_Xvalid_scaled = pd.DataFrame(Xvalid_scaled, columns=features_raw_all)

# quick sanity check of the scaled values
Xtrain_scaled[0][:2], Xtest_scaled[0][:2]
https://xgboost.readthedocs.io/en/latest/parameter.html#general-parameters
Parameters:
-------------
max_depth=3
learning_rate=0.1
n_estimators=100 # number of trees you want to build.
verbosity=1 **NOTE: it prints in the IPython terminal, not in the browser
silent=None **deprecated, use verbosity
objective='binary:logistic' **for binary classification (for this regression task use 'reg:squarederror')
booster='gbtree' **use default tree not linear even for regression (may also use dart instead of gbtree, but needs to tune)
n_jobs=1 **make this -1
nthread=None **deprecated use n_jobs
gamma=0 # A higher value leads to fewer splits.
min_child_weight=1
max_delta_step=0
subsample=1 # percentage of samples used per tree. Low value can lead to underfitting.
colsample_bytree=1 # percentage of features used per tree. High value can lead to overfitting.
colsample_bylevel=1
colsample_bynode=1
reg_alpha=0 # A large value leads to more regularization.
reg_lambda=1 # L2 regularization on leaf weights and is smoother than L1 regularization.
scale_pos_weight=1
base_score=0.5
random_state=0 **use your own random state
seed=None **deprecated use random_state
missing=None
If you have a validation set, you can use early stopping to find the optimal number of boosting rounds. Early stopping requires at least one set in evals. If there’s more than one, it will use the last.
train(..., evals=evals, early_stopping_rounds=10)
The model will train until the validation score stops improving. Validation error needs to decrease at least every early_stopping_rounds to continue training.
If early stopping occurs, the model will have three additional fields: bst.best_score, bst.best_iteration and bst.best_ntree_limit. Note that xgboost.train() will return a model from the last iteration, not the best one.
This works with both metrics to minimize (RMSE, log loss, etc.) and to maximize (MAP, NDCG, AUC). Note that if you specify more than one evaluation metric the last one in param['eval_metric'] is used for early stopping.
If early stopping is enabled during training, you can get predictions from the best iteration with bst.best_ntree_limit:
ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
Flexible tree growing policies The existing tree grower in xgboost grows a tree in a depth-wise fashion, executing splits in first level before splits in second and so forth. The new grower lets you control the way new nodes are added to the tree:
grow_policy=depthwise (default): split at nodes closest to the root, i.e. grow depth-wise. grow_policy=lossguide: split at nodes with highest loss change. This behavior mimics that of LightGBM. It has been reported that the lossguide policy often results in faster convergence in loss, though there is also risk of over-fitting(see the preliminary results).
import xgboost as xgb

# Explore the public API of xgboost and its plotting submodule.
show_method_attributes(xgb)
show_method_attributes(xgb.plotting)
# help(xgb.XGBRegressor)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict

# time
time_start = time.time()

# current parameters: aliases for the scaled splits
Xtr = Xtrain_scaled
ytr = ytrain
Xtx = Xtest_scaled
ytx = ytest

# fit the model (default hyperparameters, squared-error objective)
model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=SEED,
objective='reg:squarederror')
model_xgb.fit(Xtr, ytr)

# fitted model
model = model_xgb

# save the model
# joblib.dump(model_xgb, 'model_xgb.pkl')
# model_xgb = joblib.load('model_xgb.pkl')

# ypreds
# NOTE(review): cross_val_predict on the TEST set refits the model on test
# folds, so these predictions do not come from the model fitted above —
# confirm this is intended.
kf=KFold(n_splits=5,shuffle=True,random_state=SEED)
ypreds = cross_val_predict(model, Xtx, ytx, cv=kf)

# train validation: 5-fold CV negative MSE on the training set
cvs = cross_val_score(model, Xtr, ytr,cv=kf,
scoring = "neg_mean_squared_error")
score = cvs.mean()

# rmse
rmse = np.sqrt(sklearn.metrics.mean_squared_error(ytx,ypreds))

# explained variance
evs = explained_variance_score(ytx, ypreds)

# r-squared values
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])

# append this run to the evaluation table and drop duplicate rows
row_eval = ['Xgboost',
'default,log+standard scaling',
score,rmse,evs,r2,ar2]
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()

# time
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))

# results
df_eval
Note:
model.feature_importances_ gives important features based on information gain.
xgb.plot_importance(model) gives important features based on weights.
show_method_attributes(model_xgb)

# feature importance (gain-based, from the sklearn wrapper)
df_imp = pd.DataFrame({'Feature': features_raw_all,
'Importance_gain': model_xgb.feature_importances_
})
df_imp.sort_values('Importance_gain').style.background_gradient()
df_imp.sort_values('Importance_gain').set_index('Feature')\
.sort_values('Importance_gain').plot.barh(figsize=(12,8))

# weight-based importance plot; ticks read f0, f1, ... for numpy input
axsub = xgb.plot_importance(model_xgb)
print(features_raw_all)
print(dict_features)

# Redraw the importance plot and replace the f0, f1, ... tick labels with
# the real feature names via dict_features (strip the leading 'f', map index).
axsub = xgb.plot_importance(model_xgb)
Text_yticklabels = list(axsub.get_yticklabels())
dict_features = dict(enumerate(features_raw_all))
lst_yticklabels = [ Text_yticklabels[i].get_text().lstrip('f') for i in range(len(Text_yticklabels))]
lst_yticklabels = [ dict_features[int(i)] for i in lst_yticklabels]
axsub.set_yticklabels(lst_yticklabels)
print(dict_features)
plt.show()
def plot_feature_imp_xgb(model, numpy=False, dict_features=None):
    """Plot the feature importance horizontal bar plot for an xgboost model.

    Parameters
    ----------
    model : fitted xgboost sklearn-API model
        Must expose ``get_booster()``.
    numpy : bool
        True when the model was fit on numpy arrays, so features are named
        f0, f1, ...; they are then decoded through ``dict_features``.
    dict_features : dict, optional
        Mapping feature index -> feature name; required when ``numpy`` is True.

    Note:
    ----
    If you have used numpy arrays in xgboost:
    plot_feature_imp_xgb(model_xgb, True, dict_features)
    If you have used pandas dataframes in xgboost:
    plot_feature_imp_xgb(model_xgb)
    """
    # BUG FIX: use the `model` argument (the original read the global
    # `model_xgb`, ignoring whatever model the caller passed in).
    dict_fimp = model.get_booster().get_score(importance_type="weight")

    # feature importance table
    df_imp = pd.DataFrame({'Features': list(dict_fimp.keys()),
                           'Importance_weight': list(dict_fimp.values())
                           })
    if numpy:
        assert dict_features is not None
        # decode 'f12' -> 12 -> real feature name
        df_imp['Features'] = df_imp['Features'].str.lstrip('f').astype(int).map(dict_features)

    df_imp = df_imp.sort_values('Importance_weight').set_index('Features')
    ax = df_imp.plot.barh(figsize=(12, 8))
    plt.grid(True)
    # label fixed: this axis shows the weight (F score), not an F1-score
    plt.xlabel('F score (weight)')
    plt.title('Feature Importance based on weights', fontsize=14)
    ax.get_legend().remove()

    # annotate each bar with its weight value
    for p in ax.patches:
        x = p.get_width() * 1.01
        y = p.get_y() + 0.2
        text = '{:.0f}'.format(p.get_width())
        ax.text(x, y, text, fontsize=15, color='indigo')
    plt.show()
plot_feature_imp_xgb(model_xgb, True, dict_features)

# help(xgb.DMatrix)
# Build the native xgboost data structure with named features.
dtrain = xgb.DMatrix(Xtrain_scaled,ytrain,
feature_names=features_raw_all)

# booster parameters for xgb.cv ('alpha' is L1 regularization)
params = {"objective":"reg:squarederror",
'colsample_bytree': 0.3,
'learning_rate': 0.1,
'max_depth': 5,
'alpha': 10}

# help(xgb.cv)
# Cross-validated boosting with early stopping on RMSE.
num_boost_round=500
kf=KFold(n_splits=5,shuffle=True,random_state=SEED)
cv_results = xgb.cv(params,dtrain, num_boost_round,
nfold=5,
early_stopping_rounds=50,
metrics="rmse",
folds=kf,
verbose_eval=50, # show progress at Nth iteration
seed=SEED)
cv_results.head()
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict

# time
time_start = time.time()

# current parameters: aliases for the scaled splits
Xtr = Xtrain_scaled
ytr = ytrain
Xtx = Xtest_scaled
ytx = ytest

# get boosting data (native DMatrix; the test matrix needs no labels here)
dtrain = xgb.DMatrix(Xtr,ytr,feature_names=features_raw_all)
dtest = xgb.DMatrix(Xtx,feature_names=features_raw_all)

# boosting params
params = {"objective":"reg:squarederror",
'colsample_bytree': 0.3,
'learning_rate': 0.1,
'max_depth': 5,
'alpha': 10,
'tree_method': 'hist', # histogram-based tree construction
'grow_policy': 'depthwise', # depthwise, lossguide
}

# train the model with the low-level API (returns a Booster, not a sklearn model)
num_boost_round= 10 # eg. take 1000
bst = xgb.train(params,dtrain,num_boost_round) # this is a booster object

# time
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))

# inspect the booster vs the sklearn-style model
bst
show_method_attributes(bst) # boosting object
show_method_attributes(model_xgb) # scikit learn style regressor model
bst.best_iteration, bst.get_score(), bst.get_fscore()
# help(xgb.train)
# help(xgb.train)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
# time
time_start = time.time()
# current parameters
Xtr = Xtrain_scaled
ytr = ytrain
Xtx = Xtest_scaled
ytx = ytest
Xtv = Xvalid_scaled
ytv = yvalid
# get boosting data
dtrain = xgb.DMatrix(Xtr,ytr,feature_names=features_raw_all)
dtest = xgb.DMatrix(Xtx,ytx,feature_names=features_raw_all)
dvalid = xgb.DMatrix(Xtv,ytv,feature_names=features_raw_all)
# boosting params
params = {"objective":"reg:squarederror",
'colsample_bytree': 0.3,
'learning_rate': 0.1,
'max_depth': 5,
'alpha': 10}
# watch list
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
# train the model
num_boost_round= 100 # eg. take 1000
bst = xgb.train(params,dtrain,num_boost_round,
evals=watchlist,
verbose_eval=10
)
# save the model
# joblib.dump(bst, 'bst_xgb.pkl')
# bst = joblib.load('bst_xgb.pkl')
# ypreds
ypreds = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
# train validation
kf=KFold(n_splits=5,shuffle=True,random_state=SEED)
cv_results = xgb.cv(params,dtrain, num_boost_round,
nfold=5,
early_stopping_rounds=50,
metrics="rmse", # gives train-rmse, test-rmse mean and std
folds=kf,
verbose_eval=50, # show progress at Nth iteration
seed=SEED)
score = cv_results['train-rmse-mean'].mean()
# rmse
rmse = np.sqrt(sklearn.metrics.mean_squared_error(ytx,ypreds))
# expalined variance
evs = explained_variance_score(ytx, ypreds)
# r-squared values
r2 = sklearn.metrics.r2_score(ytx, ypreds)
ar2 = adjustedR2(r2, Xtx.shape[0], Xtx.shape[1])
row_eval = ['Xgboost',
'default,log+standard scaling,using dtrain and dtest',
score,rmse,evs,r2,ar2]
df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
# time
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
# results
df_eval
# Importance plot and one of the boosted trees from the low-level booster.
xgb.plot_importance(bst)
fig,ax = plt.subplots(figsize=(12,8))
xgb.plot_tree(bst,ax=ax,num_trees=4)

import shap

# SHAP values for the sklearn-API model on the scaled test set.
explainer = shap.TreeExplainer(model_xgb)
shap_values = explainer.shap_values(Xtest_scaled)

# load JS visualization code to notebook
shap.initjs()

# visualize the first prediction's explanation (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[0,:],
df_Xtest_scaled.iloc[0,:],
matplotlib=False,
text_rotation=90)

# load JS visualization code to notebook
shap.initjs()

# visualize the training set predictions
shap.force_plot(explainer.expected_value, shap_values, df_Xtest_scaled,matplotlib=False)

# sanity check: one SHAP value per sample per feature
Xtest_scaled.shape, shap_values.shape

# global importance and per-feature dependence plots
shap.summary_plot(shap_values, df_Xtest_scaled)
shap.summary_plot(shap_values, df_Xtest_scaled, plot_type='bar')
shap.dependence_plot("sqft_living", shap_values, df_Xtest_scaled)
shap.dependence_plot("view", shap_values, df_Xtest_scaled)
We should generally optimize model complexity first, and then tune convergence.
model complexity: n_estimators, max_depth, etc.
convergence: learning rate
Parameters:
Regularization parameters:
# Baseline model
time_start = time.time()

# model fit with default hyperparameters
model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=SEED,
objective='reg:squarederror')
model_xgb.fit(df_Xtrain_scaled, ytrain)

# adjusted R2 obtained with the defaults (recorded for later comparison)
default = """
ar2 = 0.8768138697750008
"""

# Optional: fit with early stopping on the validation set.
# model_xgb.fit(df_Xtrain_scaled,ytrain,
# eval_set=[(df_Xvalid_scaled,yvalid)],
# eval_metric='rmse', # auc for classification
# early_stopping_rounds=10, # early stopping gives
# verbose = 50,
# )
# early30 = """
# ar2 = 0.8768138697750008
# """

# predictions
ypreds = model_xgb.predict(df_Xtest_scaled)

# r-squared values
r2 = sklearn.metrics.r2_score(ytest, ypreds)
ar2 = adjustedR2(r2, df_Xtrain_scaled.shape[0], df_Xtrain_scaled.shape[1])
print(ar2)
model_xgb
# Tune n_estimators (all other hyperparameters at their defaults).
n = 1200
model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=SEED,
objective='reg:squarederror',
n_estimators=n,
)
model_xgb.fit(df_Xtrain_scaled, ytrain)
ypreds = model_xgb.predict(df_Xtest_scaled)
r2 = sklearn.metrics.r2_score(ytest, ypreds)
ar2 = adjustedR2(r2, df_Xtrain_scaled.shape[0], df_Xtrain_scaled.shape[1])
print(n, round(ar2,4))

# Search log from earlier runs (n_estimators, adjusted R2); 1200 was best.
results = \
"""
1100 0.9084
1200 0.9088
1300 0.9086
1400 0.9087
1500 0.9086
1600 0.9087
1700 0.9086
1800 0.9084
1900 0.9082
2000 0.9081
1160 0.9087
1170 0.9087
1180 0.9087
1190 0.9088
1200 0.9088
1210 0.9087
1220 0.9087
"""
# Tune max_depth with n_estimators fixed at 1200.
# (The `for` body indentation was lost in the notebook export; restored.)
for n in np.arange(3, 4):
    model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=SEED,
                                 objective='reg:squarederror',
                                 n_estimators=1200,
                                 max_depth=n,
                                 )
    model_xgb.fit(df_Xtrain_scaled, ytrain)
    ypreds = model_xgb.predict(df_Xtest_scaled)
    r2 = sklearn.metrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, df_Xtrain_scaled.shape[0], df_Xtrain_scaled.shape[1])
    print(n, round(ar2, 4))

# Search log from earlier runs (max_depth, adjusted R2); 3 was best.
"""
1 0.8744
2 0.9031
3 0.9088
4 0.9078
5 0.9058
6 0.9045
7 0.9032
""";
model_xgb
# Tune reg_alpha (L1 regularization) with earlier choices fixed.
# (The `for` body indentation was lost in the notebook export; restored.)
for n in [1]:
    model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=SEED,
                                 objective='reg:squarederror',
                                 n_estimators=1200,
                                 max_depth=3,
                                 reg_alpha=n,
                                 )
    model_xgb.fit(df_Xtrain_scaled, ytrain)
    ypreds = model_xgb.predict(df_Xtest_scaled)
    r2 = sklearn.metrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, df_Xtrain_scaled.shape[0], df_Xtrain_scaled.shape[1])
    print(n, round(ar2, 4))

# Search log from an earlier run; reg_alpha=1.0 gave the best adjusted R2.
"""
for n in np.geomspace(0.001,1,10)
0.0001 0.9083
0.0002782559402207126 0.9083
0.000774263682681127 0.9081
0.002154434690031882 0.9093
0.005994842503189409 0.9082
0.016681005372000592 0.9084
0.046415888336127774 0.908
0.12915496650148828 0.9093
0.3593813663804626 0.9095
1.0 0.9104
""";
model_xgb
# Tune reg_lambda (L2 regularization) with earlier choices fixed.
# (The `for` body indentation was lost in the notebook export; restored.)
for n in np.arange(1, 20):
    model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=SEED,
                                 objective='reg:squarederror',
                                 n_estimators=1200,
                                 max_depth=3,
                                 reg_alpha=1,
                                 reg_lambda=n,
                                 )
    model_xgb.fit(df_Xtrain_scaled, ytrain)
    ypreds = model_xgb.predict(df_Xtest_scaled)
    r2 = sklearn.metrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, df_Xtrain_scaled.shape[0], df_Xtrain_scaled.shape[1])
    print( round(n,4), round(ar2,4))

# Search log from earlier runs; reg_lambda=5 chosen as best.
"""
np.geomspace(1,1000,10)
1.0 0.9104
2.1544 0.9098
4.6416 0.9101
10.0 0.9097
21.5443 0.9101
46.4159 0.909
100.0 0.9082
215.4435 0.9076
464.1589 0.9042
1000.0 0.9004
np.arange(1,20)
1 0.9104
2 0.9104
3 0.9098
4 0.9101
5 0.9111 ** best
6 0.9111
7 0.9097
8 0.9099
9 0.9099
10 0.9097
11 0.9105
12 0.9105
13 0.9101
14 0.9099
15 0.9093
16 0.9111
17 0.9103
18 0.9101
19 0.9101
""";
model_xgb
# Tune subsample (row sampling per tree) with earlier choices fixed.
# (The `for` body indentation was lost in the notebook export; restored.)
for n in [1]:
    model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=SEED,
                                 objective='reg:squarederror',
                                 n_estimators=1200,
                                 max_depth=3,
                                 reg_alpha=1,
                                 reg_lambda=5,
                                 subsample=n
                                 )
    model_xgb.fit(df_Xtrain_scaled, ytrain)
    ypreds = model_xgb.predict(df_Xtest_scaled)
    r2 = sklearn.metrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, df_Xtrain_scaled.shape[0], df_Xtrain_scaled.shape[1])
    print( round(n,4), round(ar2,4))

# Search log from earlier runs; subsample=1.0 was best.
"""
np.geomspace(0.001,1,10)
0.001 0.8226
0.0022 0.8463
0.0046 0.8657
0.01 0.8738
0.0215 0.8885
0.0464 0.8963
0.1 0.9017
0.2154 0.904
0.4642 0.9085
1.0 0.9111
np.arange(0.8,1.0,0.02)
0.8 0.9093
0.82 0.911
0.84 0.9104
0.86 0.9096
0.88 0.9095
0.9 0.9103
0.92 0.91
0.94 0.9102
0.96 0.9097
0.98 0.9104
np.arange(0.98,1.0,0.001)
0.98 0.9104
0.981 0.9099
0.982 0.9102
0.983 0.9101
0.984 0.91
0.985 0.9099
0.986 0.9105
0.987 0.9098
0.988 0.9103
0.989 0.9107
0.99 0.9095
0.991 0.9101
0.992 0.9109
0.993 0.9101
0.994 0.9103
0.995 0.9105
0.996 0.9103
0.997 0.9103
0.998 0.9107
0.999 0.9104
1.0 0.9111
""";
model_xgb
# Tune gamma (minimum loss reduction to split) with earlier choices fixed.
# (The `for` body indentation was lost in the notebook export; restored.)
for n in [0]:
    model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=SEED,
                                 objective='reg:squarederror',
                                 n_estimators=1200,
                                 max_depth=3,
                                 reg_alpha=1,
                                 reg_lambda=5,
                                 subsample=1,
                                 gamma=n
                                 )
    model_xgb.fit(df_Xtrain_scaled, ytrain)
    ypreds = model_xgb.predict(df_Xtest_scaled)
    r2 = sklearn.metrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, df_Xtrain_scaled.shape[0], df_Xtrain_scaled.shape[1])
    print( round(n,4), round(ar2,4))

# Search log from earlier runs; gamma=0 was best.
"""
0.0 0.9111
0.1111 0.8972
0.2222 0.8903
0.3333 0.8871
0.4444 0.8829
0.5556 0.8804
0.6667 0.8787
0.7778 0.8775
0.8889 0.8758
1.0 0.8728
""";
model_xgb
# Tune min_child_weight with earlier choices fixed (had no effect here).
# (The `for` body indentation was lost in the notebook export; restored.)
for n in [1]:
    model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=SEED,
                                 objective='reg:squarederror',
                                 n_estimators=1200,
                                 max_depth=3,
                                 reg_alpha=1,
                                 reg_lambda=5,
                                 subsample=1,
                                 gamma=0,
                                 min_child_weight=n,
                                 )
    model_xgb.fit(df_Xtrain_scaled, ytrain)
    ypreds = model_xgb.predict(df_Xtest_scaled)
    r2 = sklearn.metrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, df_Xtrain_scaled.shape[0], df_Xtrain_scaled.shape[1])
    print( round(n,4), round(ar2,4))

# Search log from earlier runs; the score was flat, so the default 1 is kept.
"""
0.0 0.9111
0.1111 0.9111
0.2222 0.9111
0.3333 0.9111
0.4444 0.9111
0.5556 0.9111
0.6667 0.9111
0.7778 0.9111
0.8889 0.9111
1.0 0.9111
""";
model_xgb
# Tune colsample_bytree (feature sampling per tree) with earlier choices fixed.
# (The `for` body indentation was lost in the notebook export; restored.)
for n in [1]:
    model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=SEED,
                                 objective='reg:squarederror',
                                 n_estimators=1200,
                                 max_depth=3,
                                 reg_alpha=1,
                                 reg_lambda=5,
                                 subsample=1,
                                 gamma=0,
                                 min_child_weight=1,
                                 colsample_bytree=n
                                 )
    model_xgb.fit(df_Xtrain_scaled, ytrain)
    ypreds = model_xgb.predict(df_Xtest_scaled)
    r2 = sklearn.metrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, df_Xtrain_scaled.shape[0], df_Xtrain_scaled.shape[1])
    print( round(n,4), round(ar2,4))

# Search log from earlier runs; colsample_bytree=1.0 was best.
"""
0.0 0.8875
0.1111 0.9021
0.2222 0.9087
0.3333 0.9105
0.4444 0.9107
0.5556 0.9111
0.6667 0.9107
0.7778 0.9097
0.8889 0.9101
1.0 0.9111
0.9 0.9101
0.9111 0.9101
0.9222 0.9101
0.9333 0.9101
0.9444 0.9107
0.9556 0.9107
0.9667 0.9107
0.9778 0.9107
0.9889 0.9107
1.0 0.9111
""";
# Tune learning_rate with all earlier choices fixed.
# (The `for` body indentation was lost in the notebook export; restored.)
for n in [0.1]:
    model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=SEED,
                                 objective='reg:squarederror',
                                 n_estimators=1200,
                                 max_depth=3,
                                 reg_alpha=1,
                                 reg_lambda=5,
                                 subsample=1,
                                 gamma=0,
                                 min_child_weight=1,
                                 colsample_bytree=1,
                                 learning_rate=n
                                 )
    model_xgb.fit(df_Xtrain_scaled, ytrain)
    ypreds = model_xgb.predict(df_Xtest_scaled)
    r2 = sklearn.metrics.r2_score(ytest, ypreds)
    ar2 = adjustedR2(r2, df_Xtrain_scaled.shape[0], df_Xtrain_scaled.shape[1])
    print( round(n,4), round(ar2,4))

# Search log from earlier runs; learning_rate=0.1 was best.
"""
0.0001 -442.5797
0.001 -50.6359
0.1 0.9111
0.2 0.9086
0.3 0.9072
0.4 0.9036
""";
# import xgboost as xgb
# from sklearn.model_selection import GridSearchCV
# params_grid = {
# 'colsample_bytree':[0.4,0.6,0.8],
# 'gamma':[0,0.03,0.1,0.3],
# 'min_child_weight':[1.5,6,10],
# 'learning_rate':[0.1,0.07],
# 'max_depth':[3,5],
# 'n_estimators':[10000],
# 'reg_alpha':[1e-5, 1e-2, 0.75],
# 'reg_lambda':[1e-5, 1e-2, 0.45],
# 'subsample':[0.6,0.95]
# }
# model_xgb = xgb.XGBRegressor(learning_rate =0.1,
# n_estimators=1000,
# max_depth=5,
# min_child_weight=1,
# gamma=0,
# subsample=0.8,
# colsample_bytree=0.8,
# n_jobs=-1,
# scale_pos_weight=1,
# seed=RANDOM_STATE)
# gsearch1 = GridSearchCV(estimator=model_xgb,
# param_grid=params_grid,
# n_jobs=-1,
# iid=False,
# verbose=10,
# scoring='neg_mean_squared_error')
# gsearch1.fit(Xtrain_scaled,ytrain)
# print (gsearch1.grid_scores_)
# print('best params')
# print (gsearch1.best_params_)
# print('best score')
# print (gsearch1.best_score_)
# Final model with all tuned hyperparameters combined.
model_xgb = xgb.XGBRegressor(n_jobs=-1, random_state=SEED,
objective='reg:squarederror',
n_estimators=1200,
max_depth=3,
reg_alpha=1,
reg_lambda=5,
subsample=1,
gamma=0,
min_child_weight=1,
colsample_bytree=1,
learning_rate=0.1
)
model_xgb.fit(df_Xtrain_scaled, ytrain)
ypreds = model_xgb.predict(df_Xtest_scaled)
r2 = sklearn.metrics.r2_score(ytest, ypreds)
ar2 = adjustedR2(r2, df_Xtrain_scaled.shape[0], df_Xtrain_scaled.shape[1])
print(round(ar2,4))

# feature importance of the final model
fig,ax = plt.subplots(figsize=(12,8))
xgb.plot_importance(model_xgb,ax=ax)
plt.show()
plot_feature_imp_xgb(model_xgb)